In [1]:
# imports: data handling, plotting, progress bars, and the project's analysis helper
import glob
import sys
import warnings
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from scipy import stats
from tqdm.notebook import tqdm

warnings.filterwarnings('ignore')  # silence library warnings

sys.path.append('../scripts/')  # make ../scripts/analysis.py importable
from analysis import get_correlation
In [2]:
# load only the relevant hashtags (those assigned to an LDA topic)
topics_df = pd.read_json('../../data/BTW17_Twitter/lda/hashtag_topics.json')
hashtags = topics_df['hashtag'].tolist()
In [3]:
# load hashtag timeseries
hashtag_df = pd.read_json('../../data/BTW17_Twitter/hashtags/hashtag_counts.json')
hashtag_df.head(3)
Out[3]:
         date          hashtag  count
0  2017-05-29  150jahrekapital      1
1  2017-05-29              a19      1
2  2017-05-29      abschiebung     14
In [4]:
# load politicians' metadata and keep only the relevant columns
persons_df = pd.read_csv('../../data/BTW17_Suggestions/btw_politicians_demographic.csv')
persons_df.drop(columns=['Unnamed: 0', 'Born', 'Bundesland', 'Age'], inplace=True)
persons_df['Name'] = persons_df['Name'].str.lower()
persons_df.rename(columns={'Name':'queryterm', 'Party':'party', 'Gender':'gender'}, inplace=True)
persons_df.head(3)
Out[4]:
            queryterm party  gender
0  wolfgang stefinger   CSU    male
1       kai whittaker   CDU    male
2   katrin albsteiger   CSU  female
In [5]:
# load the suggestions timeseries and count one row per (date, queryterm, suggestion)
tmp = pd.read_parquet('../../data/BTW17_Suggestions/processed/suggestions.parquet')
tmp['date'] = pd.to_datetime(tmp['date']).dt.date
suggestions_df = (tmp.groupby(['date', 'queryterm', 'suggestion'])
                     .size()
                     .reset_index(name='count'))
suggestions_df = suggestions_df.merge(persons_df, how='left', on='queryterm')
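In [ ]:
# quick toy illustration of the counting idiom above, with made-up values
# (not project data): groupby(...).size() yields one row per unique triplet
toy = pd.DataFrame({'date': ['2017-05-29'] * 3,
                    'queryterm': ['achim post'] * 3,
                    'suggestion': ['spd', 'spd', 'nrw']})
toy.groupby(['date', 'queryterm', 'suggestion']).size().reset_index(name='count')
#          date   queryterm suggestion  count
# 0  2017-05-29  achim post        nrw      1
# 1  2017-05-29  achim post        spd      2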
In [6]:
# load vector similarities between suggestions and hashtags
similarity_df = pd.read_json('../../data/BTW17_Suggestions/suggestions/vector_similarity.json')
similarity_df['hashtags'] = [hashtags] * len(similarity_df)  # every suggestion is scored against the full hashtag list
similarity_df['suggestion'] = similarity_df['suggestion'].apply(' '.join)  # token list -> plain string
In [7]:
# attach each suggestion's cluster and aggregate the counts per cluster;
# note: GroupBy.sum's first positional argument is numeric_only, not a column
# name, so the column is selected explicitly
suggestions_df = suggestions_df.merge(similarity_df, how='inner', on='suggestion')
suggestions_df = suggestions_df.groupby(['date', 'queryterm', 'party', 'gender', 'cluster'], as_index=False)['count'].sum()
suggestions_df.head(3)
Out[7]:
         date   queryterm party gender  cluster  count
0  2017-05-29  achim post   SPD   male        2      4
1  2017-05-29  achim post   SPD   male        5     12
2  2017-05-29  achim post   SPD   male       75      4
In [8]:
# explode the per-suggestion hashtag/score lists to one row per (cluster, hashtag)
# and average the similarity scores within each cluster
similarity_df = similarity_df.set_index(['suggestion', 'cluster']).apply(pd.Series.explode).reset_index()
similarity_df['similarity_scores'] = pd.to_numeric(similarity_df['similarity_scores'])
similarity_df = similarity_df.groupby(['cluster', 'hashtags'], as_index=False)['similarity_scores'].mean()
similarity_df.head(3)
Out[8]:
   cluster     hashtags  similarity_scores
0        0    afdwählen           0.015540
1        0  afghanistan           0.005390
2        0  altersarmut           0.020913
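In [ ]:
# toy illustration of the multi-column explode recipe used above (made-up
# values): with the non-list columns moved into the index,
# apply(pd.Series.explode) unnests the parallel hashtags/similarity_scores
# lists row-aligned. pandas >= 1.3 offers the same via
# toy.explode(['hashtags', 'similarity_scores'])
toy = pd.DataFrame({'suggestion': ['spd news'], 'cluster': [2],
                    'hashtags': [['afd', 'spd']],
                    'similarity_scores': [[0.1, 0.8]]})
toy.set_index(['suggestion', 'cluster']).apply(pd.Series.explode).reset_index()
#   suggestion  cluster hashtags  similarity_scores
# 0   spd news        2      afd                0.1
# 1   spd news        2      spd                0.8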
In [9]:
# keep only (cluster, hashtag) pairs with a similarity score of at least 0.4
sim_df = similarity_df[similarity_df['similarity_scores'] >= 0.4].reset_index(drop=True)

cluster_df = suggestions_df.groupby(['date', 'cluster'], as_index=False)['count'].sum()
cluster_df.rename(columns={'count':'cluster_count'}, inplace=True)

cluster_party_df = suggestions_df.groupby(['date', 'party', 'cluster'], as_index=False)['count'].sum()
cluster_party_df.rename(columns={'count':'cluster_count'}, inplace=True)

cluster_gender_df = suggestions_df.groupby(['date', 'gender', 'cluster'], as_index=False)['count'].sum()
cluster_gender_df.rename(columns={'count':'cluster_count'}, inplace=True)

hashtag_df.rename(columns={'count':'hashtag_count'}, inplace=True)
In [10]:
# delays (in days) at which to correlate the two time series: 0, 3, ..., 60
delays = list(range(0, 61, 3))
In [11]:
%load_ext autoreload
%autoreload 2
from analysis import get_correlation
In [12]:
# compute, for every delay, the correlations between hashtag and cluster series
dfs = []
for delay in tqdm(delays):
    dfs.append(get_correlation(delay, hashtag_df, cluster_df, cluster_gender_df, cluster_party_df, sim_df))
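In [ ]:
# get_correlation lives in ../scripts/analysis.py and is not shown in this
# notebook. The sketch below is only a guess at a compatible implementation,
# assuming the delay shifts the suggestion-cluster series relative to the
# hashtag series and that scipy.stats.pearsonr is computed per
# (cluster, hashtag) pair from sim_df. The real function also produces the
# party/gender breakdowns, which are omitted here; the function name and the
# shift direction are assumptions.
def get_correlation_sketch(delay, hashtag_df, cluster_df, sim_df):
    rows = []
    shifted = cluster_df.copy()
    # move the cluster series back by `delay` days, so day t of the hashtag
    # series is aligned with day t + delay of the suggestion series
    shifted['date'] = pd.to_datetime(shifted['date']) - pd.Timedelta(days=delay)
    tags = hashtag_df.copy()
    tags['date'] = pd.to_datetime(tags['date'])
    for _, pair in sim_df.iterrows():
        h = tags[tags['hashtag'] == pair['hashtags']]
        c = shifted[shifted['cluster'] == pair['cluster']]
        merged = h.merge(c, on='date')  # keep only days present in both series
        if len(merged) < 2:
            continue  # pearsonr needs at least two paired observations
        r, _ = stats.pearsonr(merged['hashtag_count'], merged['cluster_count'])
        rows.append({'cluster': pair['cluster'], 'hashtags': pair['hashtags'],
                     'similarity_scores': pair['similarity_scores'],
                     'pearsonr': r})
    return pd.DataFrame(rows)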
In [13]:
## alternative: instead of recomputing, load the precomputed correlation
## results from disk
#input_loc = '../../data/Analysis/*.json'
#input_files = glob.glob(input_loc)
#
#dfs = []
#for file in input_files:
#    data = pd.read_json(file)
#    dfs.append(data)
In [14]:
# persist one result file per delay
for delay, df in zip(delays, dfs):
    df.to_json(f'../../data/Analysis/df_{delay}_delays.json')
In [15]:
# repeat the palette so there is one colour per delay value; copying instead of
# extending in place avoids mutating plotly's module-level list on re-runs
colors = px.colors.qualitative.Antique * 2
In [17]:
# scatter plot of the high-correlation pairs (pearsonr >= 0.5), one trace per delay
fig = go.Figure()

for i in range(len(dfs)):
    high = dfs[i][dfs[i]['pearsonr'] >= 0.5]
    fig.add_trace(go.Scatter(x=high['pearsonr'], y=high['similarity_scores'],
                             name=str(delays[i]), mode='markers',
                             marker=dict(color=colors[i])))

fig.update_layout(template='simple_white',
                  font=dict(family='Computer Modern', color='black', size=15))
fig.update_yaxes(title_text='Similarity Score')
fig.update_xaxes(title_text='Correlation')
fig.show()
In [18]:
# mean correlation over all pairs, as a function of the delay
means = [df['pearsonr'].mean() for df in dfs]

fig = px.line(x=delays, y=means, labels={'x':'Delay', 'y':'Mean Correlation'},
              template='simple_white', color_discrete_sequence=px.colors.qualitative.Antique)
fig.update_layout(font=dict(family='Computer Modern', color='black', size=15))
fig.show()
In [19]:
# mean correlation per party, as a function of the delay
plot_dict = {'Delay':[], 'Party':[], 'Mean Correlation':[]}

for i in range(len(dfs)):
    for party in sorted(set(dfs[i]['party'])):  # sorted for a stable legend order
        plot_dict['Delay'].append(delays[i])
        plot_dict['Party'].append(party)
        plot_dict['Mean Correlation'].append(dfs[i][dfs[i]['party']==party]['pearsonr'].mean())

fig = px.line(plot_dict, x='Delay', y='Mean Correlation',
              template='simple_white', color='Party', color_discrete_sequence=px.colors.qualitative.Antique)
fig.update_layout(font=dict(family='Computer Modern', color='black', size=15))
fig.show()
In [20]:
# mean correlation per gender, as a function of the delay
plot_dict = {'Delay':[], 'Gender':[], 'Mean Correlation':[]}

for i in range(len(dfs)):
    for gender in sorted(set(dfs[i]['gender'])):
        plot_dict['Delay'].append(delays[i])
        plot_dict['Gender'].append(gender)
        plot_dict['Mean Correlation'].append(dfs[i][dfs[i]['gender']==gender]['pearsonr'].mean())

fig = px.line(plot_dict, x='Delay', y='Mean Correlation',
              template='simple_white', color='Gender', color_discrete_sequence=px.colors.qualitative.Antique)
fig.update_layout(font=dict(family='Computer Modern', color='black', size=15))
fig.show()
In [21]:
# mean correlation per suggestion cluster, as a function of the delay
plot_dict = {'Delay':[], 'Cluster':[], 'Mean Correlation':[]}

for i in range(len(dfs)):
    for cluster in sorted(set(dfs[i]['cluster'])):
        plot_dict['Delay'].append(delays[i])
        plot_dict['Cluster'].append(cluster)
        plot_dict['Mean Correlation'].append(dfs[i][dfs[i]['cluster']==cluster]['pearsonr'].mean())

fig = px.line(plot_dict, x='Delay', y='Mean Correlation',
              template='simple_white', color='Cluster', color_discrete_sequence=px.colors.qualitative.Antique)
fig.update_layout(font=dict(family='Computer Modern', color='black', size=15))
fig.show()
In [22]:
# mean correlation per hashtag, as a function of the delay
plot_dict = {'Delay':[], 'Hashtag':[], 'Mean Correlation':[]}

for i in range(len(dfs)):
    for hashtag in sorted(set(dfs[i]['hashtags'])):
        plot_dict['Delay'].append(delays[i])
        plot_dict['Hashtag'].append(hashtag)
        plot_dict['Mean Correlation'].append(dfs[i][dfs[i]['hashtags']==hashtag]['pearsonr'].mean())

fig = px.line(plot_dict, x='Delay', y='Mean Correlation',
              template='simple_white', color='Hashtag', color_discrete_sequence=px.colors.qualitative.Antique)
fig.update_layout(font=dict(family='Computer Modern', color='black', size=15))
fig.show()